# -*- coding: utf-8 -*-

import os
import sys
from datetime import datetime as dt
import json
import feedparser
from bs4 import BeautifulSoup

# Example feed:
# http://feeds.feedburner.com/oreilly/radar/atom
# The feed URL may be supplied as the first command-line argument; when the
# script is run without one, fall back to the default feed below.
FEED_URL = sys.argv[1] if len(sys.argv) > 1 else "http://feed.cnblogs.com/blog/sitehome/rss"

def cleanHtml(html):
    """Strip all HTML markup from *html* and return the plain text."""
    # Name the parser explicitly: calling BeautifulSoup(html) without one
    # emits a GuessedAtParserWarning and picks whichever parser happens to
    # be installed, which can produce different text on different machines.
    # 'html.parser' is the stdlib-backed parser and is always available.
    return BeautifulSoup(html, 'html.parser').get_text()


# Download and parse the feed.
fp = feedparser.parse(FEED_URL)

# Bug fix: the original printed len(fp.entries[0].title) — the character
# count of the FIRST entry's title — instead of the number of entries.
print("Fetched %s entries from '%s'" % (len(fp.entries), fp.feed.title))

# Collect title, cleaned body text, and primary link for every entry.
# NOTE(review): this assumes every entry carries a `content` list and at
# least one link, which holds for full-content feeds like this one but
# would raise AttributeError/IndexError on summary-only feeds — confirm
# if the feed URL becomes configurable.
blog_posts = []
for e in fp.entries:
    print(e.content[0].value)
    blog_posts.append({'title': e.title,
                       'content': cleanHtml(e.content[0].value),
                       'link': e.links[0].href})

# Ensure the output directory exists before writing.
out_dir = 'out'
if not os.path.isdir(out_dir):
    os.mkdir(out_dir)

# Bug fix: the original created 'out/' but then wrote to '../data/', so the
# mkdir was dead code and the write crashed whenever '../data' was absent.
# Write into the directory we just ensured exists.
# NOTE(review): fp.feed.title may contain characters that are invalid in
# filenames (e.g. '/'); sanitize it if the feed source is untrusted.
out_file = os.path.join(out_dir, '%s__%s.json'
                        % (fp.feed.title, dt.utcnow().strftime('%Y-%m-%d')))

# Use a context manager so the file is closed even if the dump raises;
# name the encoding explicitly since ensure_ascii=False emits raw Unicode.
with open(out_file, 'w', encoding='utf-8') as f:
    f.write(json.dumps(blog_posts, ensure_ascii=False))
